In [2]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
# Load the post-12th-standard education dataset.
# NOTE(review): absolute local path — the notebook will not run on another
# machine. Naming it makes the dependency explicit; consider moving the CSV
# next to the notebook and using a relative path.
DATA_PATH = r'C:\Users\Sandeep Immadi\Downloads\Education+-+Post+12th+Standard.csv'
df = pd.read_csv(DATA_PATH)
df.head()
Out[3]:
Names Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
0 Abilene Christian University 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
1 Adelphi University 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56
2 Adrian College 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54
3 Agnes Scott College 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59
4 Alaska Pacific University 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15
In [3]:
# Summary statistics (count, mean, std, quartiles) for every numeric column,
# transposed so each feature reads as a row.
df.describe().transpose()
Out[3]:
count mean std min 25% 50% 75% max
Apps 777.0 3001.638353 3870.201484 81.0 776.0 1558.0 3624.0 48094.0
Accept 777.0 2018.804376 2451.113971 72.0 604.0 1110.0 2424.0 26330.0
Enroll 777.0 779.972973 929.176190 35.0 242.0 434.0 902.0 6392.0
Top10perc 777.0 27.558559 17.640364 1.0 15.0 23.0 35.0 96.0
Top25perc 777.0 55.796654 19.804778 9.0 41.0 54.0 69.0 100.0
F.Undergrad 777.0 3699.907336 4850.420531 139.0 992.0 1707.0 4005.0 31643.0
P.Undergrad 777.0 855.298584 1522.431887 1.0 95.0 353.0 967.0 21836.0
Outstate 777.0 10440.669241 4023.016484 2340.0 7320.0 9990.0 12925.0 21700.0
Room.Board 777.0 4357.526384 1096.696416 1780.0 3597.0 4200.0 5050.0 8124.0
Books 777.0 549.380952 165.105360 96.0 470.0 500.0 600.0 2340.0
Personal 777.0 1340.642214 677.071454 250.0 850.0 1200.0 1700.0 6800.0
PhD 777.0 72.660232 16.328155 8.0 62.0 75.0 85.0 103.0
Terminal 777.0 79.702703 14.722359 24.0 71.0 82.0 92.0 100.0
S.F.Ratio 777.0 14.089704 3.958349 2.5 11.5 13.6 16.5 39.8
perc.alumni 777.0 22.743887 12.391801 0.0 13.0 21.0 31.0 64.0
Expend 777.0 9660.171171 5221.768440 3186.0 6751.0 8377.0 10830.0 56233.0
Grad.Rate 777.0 65.463320 17.177710 10.0 53.0 65.0 78.0 118.0
In [4]:
# Dtypes and non-null counts: 777 rows, 18 columns, no missing values;
# only 'Names' is non-numeric.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 777 entries, 0 to 776
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Names        777 non-null    object 
 1   Apps         777 non-null    int64  
 2   Accept       777 non-null    int64  
 3   Enroll       777 non-null    int64  
 4   Top10perc    777 non-null    int64  
 5   Top25perc    777 non-null    int64  
 6   F.Undergrad  777 non-null    int64  
 7   P.Undergrad  777 non-null    int64  
 8   Outstate     777 non-null    int64  
 9   Room.Board   777 non-null    int64  
 10  Books        777 non-null    int64  
 11  Personal     777 non-null    int64  
 12  PhD          777 non-null    int64  
 13  Terminal     777 non-null    int64  
 14  S.F.Ratio    777 non-null    float64
 15  perc.alumni  777 non-null    int64  
 16  Expend       777 non-null    int64  
 17  Grad.Rate    777 non-null    int64  
dtypes: float64(1), int64(16), object(1)
memory usage: 109.4+ KB
In [5]:
# Per-column missing-value count (all zero for this dataset).
df.isna().sum()
Out[5]:
Names          0
Apps           0
Accept         0
Enroll         0
Top10perc      0
Top25perc      0
F.Undergrad    0
P.Undergrad    0
Outstate       0
Room.Board     0
Books          0
Personal       0
PhD            0
Terminal       0
S.F.Ratio      0
perc.alumni    0
Expend         0
Grad.Rate      0
dtype: int64
In [6]:
# Check for fully duplicated rows before analysis.
duplicate_mask = df.duplicated()
print(f'Number of duplicate rows = {duplicate_mask.sum()}')
Number of duplicate rows = 0
In [4]:
# 'Names' is a row identifier, not a feature — exclude it from the analysis frame.
data_new = df.drop(columns=['Names'])
data_new.head()
Out[4]:
Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
0 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
1 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56
2 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54
3 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59
4 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15
In [10]:
# 777 rows x 17 feature columns after dropping 'Names'.
data_new.shape
Out[10]:
(777, 17)
In [4]:
# List the feature columns remaining after the 'Names' drop.
columns_names = list(data_new.columns)
print("Columns names:")
print(columns_names)
Columns names:
['Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board', 'Books', 'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni', 'Expend', 'Grad.Rate']
In [11]:
# Skewness per feature: Apps, Accept, P.Undergrad, Books and Expend are
# strongly right-skewed; PhD and Terminal are mildly left-skewed.
data_new.skew()
Out[11]:
Apps           3.723750
Accept         3.417727
Enroll         2.690465
Top10perc      1.413217
Top25perc      0.259340
F.Undergrad    2.610458
P.Undergrad    5.692353
Outstate       0.509278
Room.Board     0.477356
Books          3.485025
Personal       1.742497
PhD           -0.768170
Terminal      -0.816542
S.F.Ratio      0.667435
perc.alumni    0.606891
Expend         3.459322
Grad.Rate     -0.113777
dtype: float64
In [12]:
# Excess kurtosis per feature; P.Undergrad, Books, Apps and Accept have
# very heavy tails (values far above 3).
data_new.kurt()
Out[12]:
Apps           26.774253
Accept         18.938099
Enroll          8.831544
Top10perc       2.208065
Top25perc      -0.564121
F.Undergrad     7.696586
P.Undergrad    55.034518
Outstate       -0.413832
Room.Board     -0.187553
Books          28.333097
Personal        7.124017
PhD             0.564773
Terminal        0.242019
S.F.Ratio       2.561209
perc.alumni    -0.096807
Expend         18.771500
Grad.Rate      -0.205226
dtype: float64
In [24]:
# Per-feature histograms; the heavy right skew of the count/fee features
# seen in skew() above is visible here.
data_new.hist(figsize=(20,30))
Out[24]:
array([[<AxesSubplot:title={'center':'Apps'}>,
        <AxesSubplot:title={'center':'Accept'}>,
        <AxesSubplot:title={'center':'Enroll'}>,
        <AxesSubplot:title={'center':'Top10perc'}>],
       [<AxesSubplot:title={'center':'Top25perc'}>,
        <AxesSubplot:title={'center':'F.Undergrad'}>,
        <AxesSubplot:title={'center':'P.Undergrad'}>,
        <AxesSubplot:title={'center':'Outstate'}>],
       [<AxesSubplot:title={'center':'Room.Board'}>,
        <AxesSubplot:title={'center':'Books'}>,
        <AxesSubplot:title={'center':'Personal'}>,
        <AxesSubplot:title={'center':'PhD'}>],
       [<AxesSubplot:title={'center':'Terminal'}>,
        <AxesSubplot:title={'center':'S.F.Ratio'}>,
        <AxesSubplot:title={'center':'perc.alumni'}>,
        <AxesSubplot:title={'center':'Expend'}>],
       [<AxesSubplot:title={'center':'Grad.Rate'}>, <AxesSubplot:>,
        <AxesSubplot:>, <AxesSubplot:>]], dtype=object)
In [30]:
# Boxplots on one shared axis — dominated by the large-magnitude features
# (Apps, F.Undergrad, Outstate), which motivates standardization before PCA.
plt.figure(figsize = (20,8))
data_new.boxplot()
Out[30]:
<AxesSubplot:>
In [8]:
# Correlation heatmap of the numeric features.
# numeric_only=True keeps the non-numeric 'Names' column out of the
# computation — on pandas >= 2.0, df.corr() without it raises a TypeError.
plt.figure(figsize=(15,10))
sns.heatmap(df.corr(numeric_only=True),annot=True,fmt=".2f");
In [5]:
# Pairwise Pearson correlations of the numeric columns.
# numeric_only=True excludes the string 'Names' column — required on
# pandas >= 2.0, where corr() no longer silently drops non-numeric columns.
df.corr(numeric_only=True)
Out[5]:
Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
Apps 1.000000 0.943451 0.846822 0.338834 0.351640 0.814491 0.398264 0.050159 0.164939 0.132559 0.178731 0.390697 0.369491 0.095633 -0.090226 0.259592 0.146755
Accept 0.943451 1.000000 0.911637 0.192447 0.247476 0.874223 0.441271 -0.025755 0.090899 0.113525 0.200989 0.355758 0.337583 0.176229 -0.159990 0.124717 0.067313
Enroll 0.846822 0.911637 1.000000 0.181294 0.226745 0.964640 0.513069 -0.155477 -0.040232 0.112711 0.280929 0.331469 0.308274 0.237271 -0.180794 0.064169 -0.022341
Top10perc 0.338834 0.192447 0.181294 1.000000 0.891995 0.141289 -0.105356 0.562331 0.371480 0.118858 -0.093316 0.531828 0.491135 -0.384875 0.455485 0.660913 0.494989
Top25perc 0.351640 0.247476 0.226745 0.891995 1.000000 0.199445 -0.053577 0.489394 0.331490 0.115527 -0.080810 0.545862 0.524749 -0.294629 0.417864 0.527447 0.477281
F.Undergrad 0.814491 0.874223 0.964640 0.141289 0.199445 1.000000 0.570512 -0.215742 -0.068890 0.115550 0.317200 0.318337 0.300019 0.279703 -0.229462 0.018652 -0.078773
P.Undergrad 0.398264 0.441271 0.513069 -0.105356 -0.053577 0.570512 1.000000 -0.253512 -0.061326 0.081200 0.319882 0.149114 0.141904 0.232531 -0.280792 -0.083568 -0.257001
Outstate 0.050159 -0.025755 -0.155477 0.562331 0.489394 -0.215742 -0.253512 1.000000 0.654256 0.038855 -0.299087 0.382982 0.407983 -0.554821 0.566262 0.672779 0.571290
Room.Board 0.164939 0.090899 -0.040232 0.371480 0.331490 -0.068890 -0.061326 0.654256 1.000000 0.127963 -0.199428 0.329202 0.374540 -0.362628 0.272363 0.501739 0.424942
Books 0.132559 0.113525 0.112711 0.118858 0.115527 0.115550 0.081200 0.038855 0.127963 1.000000 0.179295 0.026906 0.099955 -0.031929 -0.040208 0.112409 0.001061
Personal 0.178731 0.200989 0.280929 -0.093316 -0.080810 0.317200 0.319882 -0.299087 -0.199428 0.179295 1.000000 -0.010936 -0.030613 0.136345 -0.285968 -0.097892 -0.269344
PhD 0.390697 0.355758 0.331469 0.531828 0.545862 0.318337 0.149114 0.382982 0.329202 0.026906 -0.010936 1.000000 0.849587 -0.130530 0.249009 0.432762 0.305038
Terminal 0.369491 0.337583 0.308274 0.491135 0.524749 0.300019 0.141904 0.407983 0.374540 0.099955 -0.030613 0.849587 1.000000 -0.160104 0.267130 0.438799 0.289527
S.F.Ratio 0.095633 0.176229 0.237271 -0.384875 -0.294629 0.279703 0.232531 -0.554821 -0.362628 -0.031929 0.136345 -0.130530 -0.160104 1.000000 -0.402929 -0.583832 -0.306710
perc.alumni -0.090226 -0.159990 -0.180794 0.455485 0.417864 -0.229462 -0.280792 0.566262 0.272363 -0.040208 -0.285968 0.249009 0.267130 -0.402929 1.000000 0.417712 0.490898
Expend 0.259592 0.124717 0.064169 0.660913 0.527447 0.018652 -0.083568 0.672779 0.501739 0.112409 -0.097892 0.432762 0.438799 -0.583832 0.417712 1.000000 0.390343
Grad.Rate 0.146755 0.067313 -0.022341 0.494989 0.477281 -0.078773 -0.257001 0.571290 0.424942 0.001061 -0.269344 0.305038 0.289527 -0.306710 0.490898 0.390343 1.000000
In [24]:
# Pairwise scatter plots with KDE diagonals (17x17 grid — slow to render).
sns.pairplot(df,diag_kind='kde')
plt.show()
In [5]:
# Standardize every feature to zero mean / unit variance — PCA is
# scale-sensitive, and the raw features span very different magnitudes.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_std = scaler.fit_transform(data_new)
X_std
Out[5]:
array([[-3.46881819e-01, -3.21205453e-01, -6.35089011e-02, ...,
        -8.67574189e-01, -5.01910084e-01, -3.18251941e-01],
       [-2.10884040e-01, -3.87029908e-02, -2.88584214e-01, ...,
        -5.44572203e-01,  1.66109850e-01, -5.51261842e-01],
       [-4.06865631e-01, -3.76317928e-01, -4.78121319e-01, ...,
         5.85934748e-01, -1.77289956e-01, -6.67766793e-01],
       ...,
       [-2.33895071e-01, -4.23771558e-02, -9.15087008e-02, ...,
        -2.21570217e-01, -2.56241250e-01, -9.59029170e-01],
       [ 1.99171118e+00,  1.77256262e-01,  5.78332661e-01, ...,
         2.12019418e+00,  5.88797079e+00,  1.95359460e+00],
       [-3.26765760e-03, -6.68715889e-02, -9.58163623e-02, ...,
         4.24433755e-01, -9.87115613e-01,  1.95359460e+00]])
In [6]:
# Wrap the standardized array back into a DataFrame with the original
# feature names, for readable summaries and plots.
df_s = pd.DataFrame(X_std)
df_s.columns = data_new.columns
df_s.head()
Out[6]:
Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
0 -0.346882 -0.321205 -0.063509 -0.258583 -0.191827 -0.168116 -0.209207 -0.746356 -0.964905 -0.602312 1.270045 -0.163028 -0.115729 1.013776 -0.867574 -0.501910 -0.318252
1 -0.210884 -0.038703 -0.288584 -0.655656 -1.353911 -0.209788 0.244307 0.457496 1.909208 1.215880 0.235515 -2.675646 -3.378176 -0.477704 -0.544572 0.166110 -0.551262
2 -0.406866 -0.376318 -0.478121 -0.315307 -0.292878 -0.549565 -0.497090 0.201305 -0.554317 -0.905344 -0.259582 -1.204845 -0.931341 -0.300749 0.585935 -0.177290 -0.667767
3 -0.668261 -0.681682 -0.692427 1.840231 1.677612 -0.658079 -0.520752 0.626633 0.996791 -0.602312 -0.688173 1.185206 1.175657 -1.615274 1.151188 1.792851 -0.376504
4 -0.726176 -0.764555 -0.780735 -0.655656 -0.596031 -0.711924 0.009005 -0.716508 -0.216723 1.518912 0.235515 0.204672 -0.523535 -0.553542 -1.675079 0.241803 -2.939613
In [7]:
# After standardization every feature has mean ~0; the reported std is
# 1.000644 = sqrt(n/(n-1)) because StandardScaler divides by the
# population std (ddof=0) while describe() uses the sample std.
df_s.describe()
Out[7]:
Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
count 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02 7.770000e+02
mean 6.355797e-17 6.774575e-17 -5.249269e-17 -2.753232e-17 -1.546739e-16 -1.661405e-16 -3.029180e-17 6.515595e-17 3.570717e-16 -2.192583e-16 4.765243e-17 5.954768e-17 -4.481615e-16 -2.057556e-17 -6.022638e-17 1.213101e-16 3.886495e-16
std 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00 1.000644e+00
min -7.551337e-01 -7.947645e-01 -8.022728e-01 -1.506526e+00 -2.364419e+00 -7.346169e-01 -5.615022e-01 -2.014878e+00 -2.351778e+00 -2.747779e+00 -1.611860e+00 -3.962596e+00 -3.785982e+00 -2.929799e+00 -1.836580e+00 -1.240641e+00 -3.230876e+00
25% -5.754408e-01 -5.775805e-01 -5.793514e-01 -7.123803e-01 -7.476067e-01 -5.586426e-01 -4.997191e-01 -7.762035e-01 -6.939170e-01 -4.810994e-01 -7.251203e-01 -6.532948e-01 -5.915023e-01 -6.546598e-01 -7.868237e-01 -5.574826e-01 -7.260193e-01
50% -3.732540e-01 -3.710108e-01 -3.725836e-01 -2.585828e-01 -9.077663e-02 -4.111378e-01 -3.301442e-01 -1.120949e-01 -1.437297e-01 -2.992802e-01 -2.078552e-01 1.433889e-01 1.561419e-01 -1.237939e-01 -1.408197e-01 -2.458933e-01 -2.698956e-02
75% 1.609122e-01 1.654173e-01 1.314128e-01 4.221134e-01 6.671042e-01 6.294077e-02 7.341765e-02 6.179271e-01 6.318245e-01 3.067838e-01 5.310950e-01 7.562224e-01 8.358184e-01 6.093067e-01 6.666852e-01 2.241735e-01 7.302926e-01
max 1.165867e+01 9.924816e+00 6.043678e+00 3.882319e+00 2.233391e+00 5.764674e+00 1.378992e+01 2.800531e+00 3.436593e+00 1.085230e+01 8.068387e+00 1.859323e+00 1.379560e+00 6.499390e+00 3.331452e+00 8.924721e+00 3.060392e+00
In [12]:
# Boxplots of the standardized features — now on a comparable scale,
# making the outliers per feature visible.
plt.figure(figsize = (20,8))
df_s.boxplot()
Out[12]:
<AxesSubplot:>
In [16]:
# Single boxplot of Top25perc (raw scale).
plt.figure(figsize=(5,5))
sns.boxplot(data=df['Top25perc']);
In [8]:
# Covariance matrix computed by hand: S = Xc^T Xc / (n - 1), with Xc the
# mean-centered data. On standardized columns this is (up to the ddof
# factor) the correlation matrix — hence the ~1.0013 diagonal.
mean_vec = np.mean(X_std, axis=0)
centered = X_std - mean_vec
cov_mat = centered.T @ centered / (X_std.shape[0] - 1)
print('Covariance matrix \n%s' % cov_mat)
Covariance matrix 
[[ 1.00128866  0.94466636  0.84791332  0.33927032  0.35209304  0.81554018
   0.3987775   0.05022367  0.16515151  0.13272942  0.17896117  0.39120081
   0.36996762  0.09575627 -0.09034216  0.2599265   0.14694372]
 [ 0.94466636  1.00128866  0.91281145  0.19269493  0.24779465  0.87534985
   0.44183938 -0.02578774  0.09101577  0.11367165  0.20124767  0.35621633
   0.3380184   0.17645611 -0.16019604  0.12487773  0.06739929]
 [ 0.84791332  0.91281145  1.00128866  0.18152715  0.2270373   0.96588274
   0.51372977 -0.1556777  -0.04028353  0.11285614  0.28129148  0.33189629
   0.30867133  0.23757707 -0.18102711  0.06425192 -0.02236983]
 [ 0.33927032  0.19269493  0.18152715  1.00128866  0.89314445  0.1414708
  -0.10549205  0.5630552   0.37195909  0.1190116  -0.09343665  0.53251337
   0.49176793 -0.38537048  0.45607223  0.6617651   0.49562711]
 [ 0.35209304  0.24779465  0.2270373   0.89314445  1.00128866  0.19970167
  -0.05364569  0.49002449  0.33191707  0.115676   -0.08091441  0.54656564
   0.52542506 -0.29500852  0.41840277  0.52812713  0.47789622]
 [ 0.81554018  0.87534985  0.96588274  0.1414708   0.19970167  1.00128866
   0.57124738 -0.21602002 -0.06897917  0.11569867  0.31760831  0.3187472
   0.30040557  0.28006379 -0.22975792  0.01867565 -0.07887464]
 [ 0.3987775   0.44183938  0.51372977 -0.10549205 -0.05364569  0.57124738
   1.00128866 -0.25383901 -0.06140453  0.08130416  0.32029384  0.14930637
   0.14208644  0.23283016 -0.28115421 -0.08367612 -0.25733218]
 [ 0.05022367 -0.02578774 -0.1556777   0.5630552   0.49002449 -0.21602002
  -0.25383901  1.00128866  0.65509951  0.03890494 -0.29947232  0.38347594
   0.40850895 -0.55553625  0.56699214  0.6736456   0.57202613]
 [ 0.16515151  0.09101577 -0.04028353  0.37195909  0.33191707 -0.06897917
  -0.06140453  0.65509951  1.00128866  0.12812787 -0.19968518  0.32962651
   0.3750222  -0.36309504  0.27271444  0.50238599  0.42548915]
 [ 0.13272942  0.11367165  0.11285614  0.1190116   0.115676    0.11569867
   0.08130416  0.03890494  0.12812787  1.00128866  0.17952581  0.0269404
   0.10008351 -0.03197042 -0.04025955  0.11255393  0.00106226]
 [ 0.17896117  0.20124767  0.28129148 -0.09343665 -0.08091441  0.31760831
   0.32029384 -0.29947232 -0.19968518  0.17952581  1.00128866 -0.01094989
  -0.03065256  0.13652054 -0.2863366  -0.09801804 -0.26969106]
 [ 0.39120081  0.35621633  0.33189629  0.53251337  0.54656564  0.3187472
   0.14930637  0.38347594  0.32962651  0.0269404  -0.01094989  1.00128866
   0.85068186 -0.13069832  0.24932955  0.43331936  0.30543094]
 [ 0.36996762  0.3380184   0.30867133  0.49176793  0.52542506  0.30040557
   0.14208644  0.40850895  0.3750222   0.10008351 -0.03065256  0.85068186
   1.00128866 -0.16031027  0.26747453  0.43936469  0.28990033]
 [ 0.09575627  0.17645611  0.23757707 -0.38537048 -0.29500852  0.28006379
   0.23283016 -0.55553625 -0.36309504 -0.03197042  0.13652054 -0.13069832
  -0.16031027  1.00128866 -0.4034484  -0.5845844  -0.30710565]
 [-0.09034216 -0.16019604 -0.18102711  0.45607223  0.41840277 -0.22975792
  -0.28115421  0.56699214  0.27271444 -0.04025955 -0.2863366   0.24932955
   0.26747453 -0.4034484   1.00128866  0.41825001  0.49153016]
 [ 0.2599265   0.12487773  0.06425192  0.6617651   0.52812713  0.01867565
  -0.08367612  0.6736456   0.50238599  0.11255393 -0.09801804  0.43331936
   0.43936469 -0.5845844   0.41825001  1.00128866  0.39084571]
 [ 0.14694372  0.06739929 -0.02236983  0.49562711  0.47789622 -0.07887464
  -0.25733218  0.57202613  0.42548915  0.00106226 -0.26969106  0.30543094
   0.28990033 -0.30710565  0.49153016  0.39084571  1.00128866]]
In [9]:
# Eigendecomposition of the covariance matrix.
# NOTE(review): np.linalg.eig does NOT return eigenvalues in sorted order
# (visible in the Eigenvalues output below) — downstream cells must sort
# before treating them as ranked principal components. For a symmetric
# matrix, np.linalg.eigh is the numerically preferred routine.
cov_mat = np.cov(X_std.T)

# Each *column* of eig_vecs is one eigenvector; len() counts rows, which
# happens to equal the number of eigenvectors for a square matrix.
eig_vals, eig_vecs = np.linalg.eig(cov_mat)
print('\nNumber of Eigenvectors : ', len(eig_vecs))
print('Eigenvectors \n%s' %eig_vecs)
print('\nEigenvalues \n%s' %eig_vals)
Number of Eigenvectors :  17
Eigenvectors 
[[-2.48765602e-01  3.31598227e-01  6.30921033e-02 -2.81310530e-01
   5.74140964e-03  1.62374420e-02  4.24863486e-02  1.03090398e-01
   9.02270802e-02 -5.25098025e-02  3.58970400e-01 -4.59139498e-01
   4.30462074e-02 -1.33405806e-01  8.06328039e-02 -5.95830975e-01
   2.40709086e-02]
 [-2.07601502e-01  3.72116750e-01  1.01249056e-01 -2.67817346e-01
   5.57860920e-02 -7.53468452e-03  1.29497196e-02  5.62709623e-02
   1.77864814e-01 -4.11400844e-02 -5.43427250e-01  5.18568789e-01
  -5.84055850e-02  1.45497511e-01  3.34674281e-02 -2.92642398e-01
  -1.45102446e-01]
 [-1.76303592e-01  4.03724252e-01  8.29855709e-02 -1.61826771e-01
  -5.56936353e-02  4.25579803e-02  2.76928937e-02 -5.86623552e-02
   1.28560713e-01 -3.44879147e-02  6.09651110e-01  4.04318439e-01
  -6.93988831e-02 -2.95896092e-02 -8.56967180e-02  4.44638207e-01
   1.11431545e-02]
 [-3.54273947e-01 -8.24118211e-02 -3.50555339e-02  5.15472524e-02
  -3.95434345e-01  5.26927980e-02  1.61332069e-01  1.22678028e-01
  -3.41099863e-01 -6.40257785e-02 -1.44986329e-01  1.48738723e-01
  -8.10481404e-03 -6.97722522e-01 -1.07828189e-01 -1.02303616e-03
   3.85543001e-02]
 [-3.44001279e-01 -4.47786551e-02  2.41479376e-02  1.09766541e-01
  -4.26533594e-01 -3.30915896e-02  1.18485556e-01  1.02491967e-01
  -4.03711989e-01 -1.45492289e-02  8.03478445e-02 -5.18683400e-02
  -2.73128469e-01  6.17274818e-01  1.51742110e-01 -2.18838802e-02
  -8.93515563e-02]
 [-1.54640962e-01  4.17673774e-01  6.13929764e-02 -1.00412335e-01
  -4.34543659e-02  4.34542349e-02  2.50763629e-02 -7.88896442e-02
   5.94419181e-02 -2.08471834e-02 -4.14705279e-01 -5.60363054e-01
  -8.11578181e-02 -9.91640992e-03 -5.63728817e-02  5.23622267e-01
   5.61767721e-02]
 [-2.64425045e-02  3.15087830e-01 -1.39681716e-01  1.58558487e-01
   3.02385408e-01  1.91198583e-01 -6.10423460e-02 -5.70783816e-01
  -5.60672902e-01  2.23105808e-01  9.01788964e-03  5.27313042e-02
   1.00693324e-01 -2.09515982e-02  1.92857500e-02 -1.25997650e-01
  -6.35360730e-02]
 [-2.94736419e-01 -2.49643522e-01 -4.65988731e-02 -1.31291364e-01
   2.22532003e-01  3.00003910e-02 -1.08528966e-01 -9.84599754e-03
   4.57332880e-03 -1.86675363e-01  5.08995918e-02 -1.01594830e-01
   1.43220673e-01 -3.83544794e-02 -3.40115407e-02  1.41856014e-01
  -8.23443779e-01]
 [-2.49030449e-01 -1.37808883e-01 -1.48967389e-01 -1.84995991e-01
   5.60919470e-01 -1.62755446e-01 -2.09744235e-01  2.21453442e-01
  -2.75022548e-01 -2.98324237e-01  1.14639620e-03  2.59293381e-02
  -3.59321731e-01 -3.40197083e-03 -5.84289756e-02  6.97485854e-02
   3.54559731e-01]
 [-6.47575181e-02  5.63418434e-02 -6.77411649e-01 -8.70892205e-02
  -1.27288825e-01 -6.41054950e-01  1.49692034e-01 -2.13293009e-01
   1.33663353e-01  8.20292186e-02  7.72631963e-04 -2.88282896e-03
   3.19400370e-02  9.43887925e-03 -6.68494643e-02 -1.14379958e-02
  -2.81593679e-02]
 [ 4.25285386e-02  2.19929218e-01 -4.99721120e-01  2.30710568e-01
  -2.22311021e-01  3.31398003e-01 -6.33790064e-01  2.32660840e-01
   9.44688900e-02 -1.36027616e-01 -1.11433396e-03  1.28904022e-02
  -1.85784733e-02  3.09001353e-03  2.75286207e-02 -3.94547417e-02
  -3.92640266e-02]
 [-3.18312875e-01  5.83113174e-02  1.27028371e-01  5.34724832e-01
   1.40166326e-01 -9.12555212e-02  1.09641298e-03  7.70400002e-02
   1.85181525e-01  1.23452200e-01  1.38133366e-02 -2.98075465e-02
   4.03723253e-02  1.12055599e-01 -6.91126145e-01 -1.27696382e-01
   2.32224316e-02]
 [-3.17056016e-01  4.64294477e-02  6.60375454e-02  5.19443019e-01
   2.04719730e-01 -1.54927646e-01  2.84770105e-02  1.21613297e-02
   2.54938198e-01  8.85784627e-02  6.20932749e-03  2.70759809e-02
  -5.89734026e-02 -1.58909651e-01  6.71008607e-01  5.83134662e-02
   1.64850420e-02]
 [ 1.76957895e-01  2.46665277e-01  2.89848401e-01  1.61189487e-01
  -7.93882496e-02 -4.87045875e-01 -2.19259358e-01  8.36048735e-02
  -2.74544380e-01 -4.72045249e-01 -2.22215182e-03  2.12476294e-02
   4.45000727e-01  2.08991284e-02  4.13740967e-02  1.77152700e-02
  -1.10262122e-02]
 [-2.05082369e-01 -2.46595274e-01  1.46989274e-01 -1.73142230e-02
  -2.16297411e-01  4.73400144e-02 -2.43321156e-01 -6.78523654e-01
   2.55334907e-01 -4.22999706e-01 -1.91869743e-02 -3.33406243e-03
  -1.30727978e-01  8.41789410e-03 -2.71542091e-02 -1.04088088e-01
   1.82660654e-01]
 [-3.18908750e-01 -1.31689865e-01 -2.26743985e-01 -7.92734946e-02
   7.59581203e-02  2.98118619e-01  2.26584481e-01  5.41593771e-02
   4.91388809e-02 -1.32286331e-01 -3.53098218e-02  4.38803230e-02
   6.92088870e-01  2.27742017e-01  7.31225166e-02  9.37464497e-02
   3.25982295e-01]
 [-2.52315654e-01 -1.69240532e-01  2.08064649e-01 -2.69129066e-01
  -1.09267913e-01 -2.16163313e-01 -5.59943937e-01  5.33553891e-03
  -4.19043052e-02  5.90271067e-01 -1.30710024e-02  5.00844705e-03
   2.19839000e-01  3.39433604e-03  3.64767385e-02  6.91969778e-02
   1.22106697e-01]]

Eigenvalues 
[5.45052162 4.48360686 1.17466761 1.00820573 0.93423123 0.84849117
 0.6057878  0.58787222 0.53061262 0.4043029  0.02302787 0.03672545
 0.31344588 0.08802464 0.1439785  0.16779415 0.22061096]
In [10]:
plt.figure(figsize=(15,10))
# NOTE(review): cov_mat is the covariance of the *standardized* data, which
# equals the correlation matrix up to the ddof factor — that is why the
# title below says "Correlation"; confirm this labeling is intended.
sns.heatmap(cov_mat, vmax=1, square=True,annot=True,cmap='cubehelix')

plt.title('Correlation between different features')
Out[10]:
Text(0.5, 1.0, 'Correlation between different features')
In [11]:
# Tabulate the eigenvectors; each *column* of eig_vecs is one eigenvector,
# so after the .T below each row 'eigvec_i' is a full eigenvector.
eigenvec_df = pd.DataFrame(eig_vecs)
eigenvec_df.columns = [f'eigvec_{i}' for i in range(1, len(eig_vecs) + 1)]
eigenvec_df.T
Out[11]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
eigvec_1 -0.248766 -0.207602 -0.176304 -0.354274 -0.344001 -0.154641 -0.026443 -0.294736 -0.249030 -0.064758 0.042529 -0.318313 -0.317056 0.176958 -0.205082 -0.318909 -0.252316
eigvec_2 0.331598 0.372117 0.403724 -0.082412 -0.044779 0.417674 0.315088 -0.249644 -0.137809 0.056342 0.219929 0.058311 0.046429 0.246665 -0.246595 -0.131690 -0.169241
eigvec_3 0.063092 0.101249 0.082986 -0.035056 0.024148 0.061393 -0.139682 -0.046599 -0.148967 -0.677412 -0.499721 0.127028 0.066038 0.289848 0.146989 -0.226744 0.208065
eigvec_4 -0.281311 -0.267817 -0.161827 0.051547 0.109767 -0.100412 0.158558 -0.131291 -0.184996 -0.087089 0.230711 0.534725 0.519443 0.161189 -0.017314 -0.079273 -0.269129
eigvec_5 0.005741 0.055786 -0.055694 -0.395434 -0.426534 -0.043454 0.302385 0.222532 0.560919 -0.127289 -0.222311 0.140166 0.204720 -0.079388 -0.216297 0.075958 -0.109268
eigvec_6 0.016237 -0.007535 0.042558 0.052693 -0.033092 0.043454 0.191199 0.030000 -0.162755 -0.641055 0.331398 -0.091256 -0.154928 -0.487046 0.047340 0.298119 -0.216163
eigvec_7 0.042486 0.012950 0.027693 0.161332 0.118486 0.025076 -0.061042 -0.108529 -0.209744 0.149692 -0.633790 0.001096 0.028477 -0.219259 -0.243321 0.226584 -0.559944
eigvec_8 0.103090 0.056271 -0.058662 0.122678 0.102492 -0.078890 -0.570784 -0.009846 0.221453 -0.213293 0.232661 0.077040 0.012161 0.083605 -0.678524 0.054159 0.005336
eigvec_9 0.090227 0.177865 0.128561 -0.341100 -0.403712 0.059442 -0.560673 0.004573 -0.275023 0.133663 0.094469 0.185182 0.254938 -0.274544 0.255335 0.049139 -0.041904
eigvec_10 -0.052510 -0.041140 -0.034488 -0.064026 -0.014549 -0.020847 0.223106 -0.186675 -0.298324 0.082029 -0.136028 0.123452 0.088578 -0.472045 -0.423000 -0.132286 0.590271
eigvec_11 0.358970 -0.543427 0.609651 -0.144986 0.080348 -0.414705 0.009018 0.050900 0.001146 0.000773 -0.001114 0.013813 0.006209 -0.002222 -0.019187 -0.035310 -0.013071
eigvec_12 -0.459139 0.518569 0.404318 0.148739 -0.051868 -0.560363 0.052731 -0.101595 0.025929 -0.002883 0.012890 -0.029808 0.027076 0.021248 -0.003334 0.043880 0.005008
eigvec_13 0.043046 -0.058406 -0.069399 -0.008105 -0.273128 -0.081158 0.100693 0.143221 -0.359322 0.031940 -0.018578 0.040372 -0.058973 0.445001 -0.130728 0.692089 0.219839
eigvec_14 -0.133406 0.145498 -0.029590 -0.697723 0.617275 -0.009916 -0.020952 -0.038354 -0.003402 0.009439 0.003090 0.112056 -0.158910 0.020899 0.008418 0.227742 0.003394
eigvec_15 0.080633 0.033467 -0.085697 -0.107828 0.151742 -0.056373 0.019286 -0.034012 -0.058429 -0.066849 0.027529 -0.691126 0.671009 0.041374 -0.027154 0.073123 0.036477
eigvec_16 -0.595831 -0.292642 0.444638 -0.001023 -0.021884 0.523622 -0.125998 0.141856 0.069749 -0.011438 -0.039455 -0.127696 0.058313 0.017715 -0.104088 0.093746 0.069197
eigvec_17 0.024071 -0.145102 0.011143 0.038554 -0.089352 0.056177 -0.063536 -0.823444 0.354560 -0.028159 -0.039264 0.023222 0.016485 -0.011026 0.182661 0.325982 0.122107
In [18]:
# Fit PCA keeping 8 components (~88.7% cumulative explained variance per
# the cumulative-variance output further below).
from sklearn.decomposition import PCA
pca = PCA(n_components=8)
X_pca= pca.fit_transform(X_std)
In [19]:
# Transposed scores: one row per principal component, one column per record.
X_pca.transpose()
Out[19]:
array([[-1.59285540e+00, -2.19240180e+00, -1.43096371e+00, ...,
        -7.32560596e-01,  7.91932735e+00, -4.69508066e-01],
       [ 7.67333510e-01, -5.78829984e-01, -1.09281889e+00, ...,
        -7.72352397e-02, -2.06832886e+00,  3.66660943e-01],
       [-1.01073537e-01,  2.27879812e+00, -4.38092811e-01, ...,
        -4.05641899e-04,  2.07356368e+00, -1.32891515e+00],
       ...,
       [-2.98306081e-01, -1.77137309e-01, -9.60591689e-01, ...,
         4.68014248e-01, -2.06993738e+00,  8.39893087e-01],
       [ 6.38443468e-01,  2.36753302e-01, -2.48276091e-01, ...,
        -1.31749158e+00,  8.33276555e-02,  1.30731260e+00],
       [-8.79386137e-01,  4.69253269e-02,  3.08740489e-01, ...,
        -1.28288447e-01, -5.52585842e-01,  6.27409633e-01]])
In [20]:
# Loadings matrix: one row per retained principal component, one column
# per original feature.
data_c = pd.DataFrame(pca.components_, columns=df_s.columns)
data_c
Out[20]:
Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
0 0.248766 0.207602 0.176304 0.354274 0.344001 0.154641 0.026443 0.294736 0.249030 0.064758 -0.042529 0.318313 0.317056 -0.176958 0.205082 0.318909 0.252316
1 0.331598 0.372117 0.403724 -0.082412 -0.044779 0.417674 0.315088 -0.249644 -0.137809 0.056342 0.219929 0.058311 0.046429 0.246665 -0.246595 -0.131690 -0.169241
2 -0.063092 -0.101249 -0.082986 0.035056 -0.024148 -0.061393 0.139682 0.046599 0.148967 0.677412 0.499721 -0.127028 -0.066038 -0.289848 -0.146989 0.226744 -0.208065
3 0.281311 0.267817 0.161827 -0.051547 -0.109767 0.100412 -0.158558 0.131291 0.184996 0.087089 -0.230711 -0.534725 -0.519443 -0.161189 0.017314 0.079273 0.269129
4 0.005741 0.055786 -0.055694 -0.395434 -0.426534 -0.043454 0.302385 0.222532 0.560919 -0.127289 -0.222311 0.140166 0.204720 -0.079388 -0.216297 0.075958 -0.109268
5 -0.016237 0.007535 -0.042558 -0.052693 0.033092 -0.043454 -0.191199 -0.030000 0.162755 0.641055 -0.331398 0.091256 0.154928 0.487046 -0.047340 -0.298119 0.216163
6 -0.042486 -0.012950 -0.027693 -0.161332 -0.118486 -0.025076 0.061042 0.108529 0.209744 -0.149692 0.633790 -0.001096 -0.028477 0.219259 0.243321 -0.226584 0.559944
7 -0.103090 -0.056271 0.058662 -0.122678 -0.102492 0.078890 0.570784 0.009846 -0.221453 0.213293 -0.232661 -0.077040 -0.012161 -0.083605 0.678524 -0.054159 -0.005336
In [21]:
# Correlation computed across the 8 loading rows for each pair of features.
# NOTE(review): correlating *loadings* is unusual — confirm this was meant
# rather than the correlation of the original features (df.corr()).
correl = data_c.corr()
correl
Out[21]:
Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
Apps 1.000000 0.976862 0.865244 0.389443 0.344998 0.788892 -0.268088 -0.108111 -0.066128 -0.328058 -0.124258 -0.084422 -0.175779 -0.034633 -0.360383 0.244745 0.010360
Accept 0.976862 1.000000 0.900902 0.225521 0.210724 0.838960 -0.164102 -0.203893 -0.104933 -0.421813 -0.174346 -0.058894 -0.149572 0.086361 -0.351331 0.095023 -0.010818
Enroll 0.865244 0.900902 1.000000 0.306888 0.323216 0.985887 0.130705 -0.435656 -0.490004 -0.325686 -0.055797 -0.009496 -0.099455 0.087549 -0.104308 0.037302 -0.133982
Top10perc 0.389443 0.225521 0.306888 1.000000 0.969762 0.259747 -0.343682 0.189781 -0.206397 0.299676 0.083594 0.197573 0.150896 -0.241697 0.180224 0.492554 0.184446
Top25perc 0.344998 0.210724 0.323216 0.969762 1.000000 0.290447 -0.347025 0.069418 -0.288854 0.313321 0.068945 0.302558 0.250222 -0.014050 0.224918 0.284028 0.259746
F.Undergrad 0.788892 0.838960 0.985887 0.259747 0.290447 1.000000 0.252337 -0.517323 -0.545002 -0.299667 0.000028 0.074912 -0.009071 0.119136 -0.106266 0.000471 -0.230232
P.Undergrad -0.268088 -0.164102 0.130705 -0.343682 -0.347025 0.252337 1.000000 -0.236924 -0.404850 -0.251802 0.056410 0.205443 0.236724 -0.240205 0.335388 0.058259 -0.561162
Outstate -0.108111 -0.203893 -0.435656 0.189781 0.069418 -0.517323 -0.236924 1.000000 0.720192 -0.300793 -0.155159 0.135027 0.142212 -0.540703 0.198058 0.591716 0.379277
Room.Board -0.066128 -0.104933 -0.490004 -0.206397 -0.288854 -0.545002 -0.404850 0.720192 1.000000 -0.247763 -0.073650 0.196583 0.211281 -0.167806 -0.448561 0.292063 0.180194
Books -0.328058 -0.421813 -0.325686 0.299676 0.313321 -0.299667 -0.251802 -0.300793 -0.247763 1.000000 -0.066285 -0.108586 -0.017300 0.057742 -0.120554 -0.006089 -0.321986
Personal -0.124258 -0.174346 -0.055797 0.083594 0.068945 0.000028 0.056410 -0.155159 -0.073650 -0.066285 1.000000 0.030626 -0.049266 -0.053862 -0.123147 0.006721 0.094284
PhD -0.084422 -0.058894 -0.009496 0.197573 0.302558 0.074912 0.205443 0.135027 0.196583 -0.108586 0.030626 1.000000 0.987833 0.263893 -0.034915 0.027374 -0.063158
Terminal -0.175779 -0.149572 -0.099455 0.150896 0.250222 -0.009071 0.236724 0.142212 0.211281 -0.017300 -0.049266 0.987833 1.000000 0.234930 -0.033977 0.040665 -0.145634
S.F.Ratio -0.034633 0.086361 0.087549 -0.241697 -0.014050 0.119136 -0.240205 -0.540703 -0.167806 0.057742 -0.053862 0.263893 0.234930 1.000000 -0.133006 -0.903388 0.303828
perc.alumni -0.360383 -0.351331 -0.104308 0.180224 0.224918 -0.106266 0.335388 0.198058 -0.448561 -0.120554 -0.123147 -0.034915 -0.033977 -0.133006 1.000000 -0.059557 0.392132
Expend 0.244745 0.095023 0.037302 0.492554 0.284028 0.000471 0.058259 0.591716 0.292063 -0.006089 0.006721 0.027374 0.040665 -0.903388 -0.059557 1.000000 -0.301228
Grad.Rate 0.010360 -0.010818 -0.133982 0.184446 0.259746 -0.230232 -0.561162 0.379277 0.180194 -0.321986 0.094284 -0.063158 -0.145634 0.303828 0.392132 -0.301228 1.000000
In [23]:
# Variance explained by each retained component — matches the 8 largest
# eigenvalues from the manual decomposition above.
pca.explained_variance_
Out[23]:
array([5.45052162, 4.48360686, 1.17466761, 1.00820573, 0.93423123,
       0.84849117, 0.6057878 , 0.58787222])
In [20]:
# Loadings of the first principal component (eigenvector signs are
# arbitrary, so they may be flipped relative to the manual eigenvector).
pca.components_[0] 
Out[20]:
array([ 0.2487656 ,  0.2076015 ,  0.17630359,  0.35427395,  0.34400128,
        0.15464096,  0.0264425 ,  0.29473642,  0.24903045,  0.06475752,
       -0.04252854,  0.31831287,  0.31705602, -0.17695789,  0.20508237,
        0.31890875,  0.25231565])
In [22]:
# Print PC1 as a readable linear combination of the original features.
# str.join fixes the dangling trailing ' + ' the original loop emitted
# after the last term.
print('The Linear eq of 1st component: ')
terms = ['{} * {}'.format(np.round(pca.components_[0][i], 2), data_c.columns[i])
         for i in range(data_c.shape[1])]
print(' + '.join(terms))
The Linear eq of 1st component: 
0.25 * Apps + 0.21 * Accept + 0.18 * Enroll + 0.35 * Top10perc + 0.34 * Top25perc + 0.15 * F.Undergrad + 0.03 * P.Undergrad + 0.29 * Outstate + 0.25 * Room.Board + 0.06 * Books + -0.04 * Personal + 0.32 * PhD + 0.32 * Terminal + -0.18 * S.F.Ratio + 0.21 * perc.alumni + 0.32 * Expend + 0.25 * Grad.Rate + 
In [23]:
# Cumulative percentage of variance explained, over eigenvalues sorted
# in descending order (np.linalg.eig returns them unsorted).
total_var = eig_vals.sum()
var_exp = [100 * ev / total_var for ev in sorted(eig_vals, reverse=True)]
cum_var_exp = np.cumsum(var_exp)
cum_var_exp
Out[23]:
array([ 32.0206282 ,  58.36084263,  65.26175919,  71.18474841,
        76.67315352,  81.65785448,  85.21672597,  88.67034731,
        91.78758099,  94.16277251,  96.00419883,  97.30024023,
        98.28599436,  99.13183669,  99.64896227,  99.86471628,
       100.        ])
In [56]:
# Scree plot of the eigenvalues.
# BUG FIX: np.linalg.eig returns eigenvalues unsorted, so the original curve
# was non-monotonic — sort descending before plotting. The Kaiser reference
# line at eigenvalue = 1 now spans all 17 components (the original stopped
# at 11).
fig = plt.figure(figsize=(8,5))
sing_vals = range(data_new.shape[1])
plt.plot(sing_vals, sorted(eig_vals, reverse=True), 'ro-', linewidth=2)
plt.plot(sing_vals, np.ones(data_new.shape[1]))
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()
In [33]:
# BUG FIX: the original referenced `data_news`, which is never defined in
# this notebook (NameError on Restart & Run All). The summary shown
# (means ~0, std 1.000644) is that of the standardized frame df_s.
df_s.describe().T
Out[33]:
count mean std min 25% 50% 75% max
Apps 777.0 6.355797e-17 1.000644 -0.755134 -0.575441 -0.373254 0.160912 11.658671
Accept 777.0 6.774575e-17 1.000644 -0.794764 -0.577581 -0.371011 0.165417 9.924816
Enroll 777.0 -5.249269e-17 1.000644 -0.802273 -0.579351 -0.372584 0.131413 6.043678
Top10perc 777.0 -2.753232e-17 1.000644 -1.506526 -0.712380 -0.258583 0.422113 3.882319
Top25perc 777.0 -1.546739e-16 1.000644 -2.364419 -0.747607 -0.090777 0.667104 2.233391
F.Undergrad 777.0 -1.661405e-16 1.000644 -0.734617 -0.558643 -0.411138 0.062941 5.764674
P.Undergrad 777.0 -3.029180e-17 1.000644 -0.561502 -0.499719 -0.330144 0.073418 13.789921
Outstate 777.0 6.515595e-17 1.000644 -2.014878 -0.776203 -0.112095 0.617927 2.800531
Room.Board 777.0 3.570717e-16 1.000644 -2.351778 -0.693917 -0.143730 0.631824 3.436593
Books 777.0 -2.192583e-16 1.000644 -2.747779 -0.481099 -0.299280 0.306784 10.852297
Personal 777.0 4.765243e-17 1.000644 -1.611860 -0.725120 -0.207855 0.531095 8.068387
PhD 777.0 5.954768e-17 1.000644 -3.962596 -0.653295 0.143389 0.756222 1.859323
Terminal 777.0 -4.481615e-16 1.000644 -3.785982 -0.591502 0.156142 0.835818 1.379560
S.F.Ratio 777.0 -2.057556e-17 1.000644 -2.929799 -0.654660 -0.123794 0.609307 6.499390
perc.alumni 777.0 -6.022638e-17 1.000644 -1.836580 -0.786824 -0.140820 0.666685 3.331452
Expend 777.0 1.213101e-16 1.000644 -1.240641 -0.557483 -0.245893 0.224174 8.924721
Grad.Rate 777.0 3.886495e-16 1.000644 -3.230876 -0.726019 -0.026990 0.730293 3.060392
In [34]:
# Boxplots of the raw (unscaled) features, for side-by-side comparison
# with the standardized boxplots in the next cell.
plt.figure(figsize = (20,8))
data_new.boxplot()
Out[34]:
<AxesSubplot:>
In [35]:
# BUG FIX: `data_news` is never defined in this notebook (NameError on a
# fresh run); the standardized frame is df_s.
plt.figure(figsize = (20,8))
df_s.boxplot()
Out[35]:
<AxesSubplot:>
In [25]:
from matplotlib.patches import Rectangle

# Heatmap of the 8 PCA loading rows; for each feature (column), outline in
# red the component on which it loads most strongly (largest |loading|).
fig,ax = plt.subplots(figsize=(22,10),facecolor='w',edgecolor='k')
ax = sns.heatmap(data_c,annot=True,vmax=1.0,cmap='Blues',cbar=False , yticklabels =['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8'])
# For each feature, the row index of the component with max absolute loading.
column_max = data_c.abs().idxmax(axis=0)

for col,variable in enumerate(data_c.columns):
    position = data_c.index.get_loc(column_max[variable])
    # Rectangle((x, y), w, h): x = feature column, y = component row in
    # heatmap cell coordinates.
    ax.add_patch(Rectangle((col,position),1,1, fill= False, edgecolor='red',lw=3))
In [ ]: